{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "# import modules\n",
    "import pandas\n",
    "from sklearn import svm\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "from sklearn.decomposition import PCA\n",
    "from pylab import *\n",
    "import struct\n",
    "import keras as ks\n",
    "import logging\n",
    "from keras.layers import Dense, Activation, Flatten, Convolution2D\n",
    "from keras.utils import np_utils\n",
    "from keras.models import model_from_json\n",
    "from keras import backend as K\n",
    "\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from skimage import io\n",
    "import numpy as np\n",
    "from PIL import Image \n",
    "from scipy import misc\n",
    "import os\n",
    "# ..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据读取和预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 3888 * 51 with 4 labels\n",
    "data_raw = pandas.read_csv(\"data/host10280-labeled.csv\")\n",
    "data = np.array(data_raw)\n",
    "data_raw = pandas.read_csv(\"data/host10274-labeled.csv\")\n",
    "data = np.concatenate((data,np.array(data_raw)),axis=0)\n",
    "data_raw = pandas.read_csv(\"data/host10283-labeled.csv\")\n",
    "data = np.concatenate((data,np.array(data_raw)),axis=0)\n",
    "\n",
    "# 51 = 2 ids + 45 features + 4 labels\n",
    "# ids: host + clock\n",
    "# labels: normal, cpu, mem, io\n",
    "data_features = data[:,2:47]\n",
    "data_labels = data[:,47:51]\n",
    "\n",
    "data_class = [] # 0 ~ 3\n",
    "for label in data_labels:\n",
    "    data_class.append(np.dot([0,1,2,3],label))\n",
    "data_class = np.array(data_class,dtype=\"int\")\n",
    "# cpu: 180\n",
    "# mem: 180\n",
    "# io : 180\n",
    "# normal: 3348\n",
    "\n",
    "# select 180 normal samples randomly\n",
    "data_tmp = data_features[data_class == 0]\n",
    "index_tmp = np.arange(0,data_tmp.shape[0],data_tmp.shape[0]/180)[0:180]\n",
    "data_normal_tmp = data_tmp[index_tmp]\n",
    "\n",
    "# features : 720(180*4) * 45\n",
    "# labels   : 720(180*4) * 4\n",
    "# class    : 720(180*4) * 1\n",
    "data_features = np.concatenate((data_normal_tmp,data_features[data_class != 0]),axis=0)\n",
    "data_labels = np.concatenate((np.array([[1,0,0,0]]*180),data_labels[data_class != 0]),axis=0)\n",
    "data_class = np.concatenate((np.array([0]*180),data_class[data_class != 0]),axis=0)\n",
    "\n",
    "# feature scaling\n",
    "for i in range(data_features.shape[1]):\n",
    "    d_min = data_features[:,i].min()\n",
    "    d_max = data_features[:,i].max()\n",
    "    if d_min == d_max:\n",
    "        data_features[:,i] = 1\n",
    "        continue\n",
    "    data_features[:,i] -= d_min\n",
    "    data_features[:,i] /= (d_max - d_min)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 划分训练集和数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(576, 45)\n",
      "(576, 4)\n",
      "(576,)\n",
      "(144, 45)\n",
      "(144, 4)\n",
      "(144,)\n"
     ]
    }
   ],
   "source": [
    "index_test = np.arange(0,720,5)\n",
    "index_train = np.array(list(set(np.arange(0,720,1)) - set(index_test)))\n",
    "data_train = data_features[index_train]\n",
    "labels_train = data_labels[index_train]\n",
    "class_train = data_class[index_train]\n",
    "data_test = data_features[index_test]\n",
    "labels_test = data_labels[index_test]\n",
    "class_test = data_class[index_test]\n",
    "print(data_train.shape)\n",
    "print(labels_train.shape)\n",
    "print(class_train.shape)\n",
    "print(data_test.shape)\n",
    "print(labels_test.shape)\n",
    "print(class_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.954861111111\n",
      "0.916666666667\n"
     ]
    }
   ],
   "source": [
    "clf = svm.SVC()\n",
    "clf.fit(data_train, class_train)\n",
    "print(clf.score(data_train, class_train))\n",
    "print(clf.score(data_test, class_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 神经网络\n",
    "45 => 128 => 64 => 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /Users/lyq/Anaconda/anaconda2/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:2755: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "keep_dims is deprecated, use keepdims instead\n",
      "WARNING:tensorflow:From /Users/lyq/Anaconda/anaconda2/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:1290: calling reduce_mean (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "keep_dims is deprecated, use keepdims instead\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lyq/Anaconda/anaconda2/lib/python2.7/site-packages/keras/models.py:848: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n",
      "  warnings.warn('The `nb_epoch` argument in `fit` '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 576 samples, validate on 144 samples\n",
      "Epoch 1/50\n",
      "576/576 [==============================] - 0s - loss: 1.3437 - acc: 0.3663 - val_loss: 1.2766 - val_acc: 0.4653\n",
      "Epoch 2/50\n",
      "576/576 [==============================] - 0s - loss: 1.2359 - acc: 0.5434 - val_loss: 1.1807 - val_acc: 0.7361\n",
      "Epoch 3/50\n",
      "576/576 [==============================] - 0s - loss: 1.1240 - acc: 0.8229 - val_loss: 1.0658 - val_acc: 0.8819\n",
      "Epoch 4/50\n",
      "576/576 [==============================] - 0s - loss: 1.0118 - acc: 0.9010 - val_loss: 0.9642 - val_acc: 0.9167\n",
      "Epoch 5/50\n",
      "576/576 [==============================] - 0s - loss: 0.8990 - acc: 0.9549 - val_loss: 0.8561 - val_acc: 0.9167\n",
      "Epoch 6/50\n",
      "576/576 [==============================] - 0s - loss: 0.7953 - acc: 0.9323 - val_loss: 0.7583 - val_acc: 0.9167\n",
      "Epoch 7/50\n",
      "576/576 [==============================] - 0s - loss: 0.6850 - acc: 0.9497 - val_loss: 0.6568 - val_acc: 0.9375\n",
      "Epoch 8/50\n",
      "576/576 [==============================] - 0s - loss: 0.5865 - acc: 0.9601 - val_loss: 0.5704 - val_acc: 0.9306\n",
      "Epoch 9/50\n",
      "576/576 [==============================] - 0s - loss: 0.4945 - acc: 0.9549 - val_loss: 0.4948 - val_acc: 0.9375\n",
      "Epoch 10/50\n",
      "576/576 [==============================] - 0s - loss: 0.4221 - acc: 0.9531 - val_loss: 0.4356 - val_acc: 0.9375\n",
      "Epoch 11/50\n",
      "576/576 [==============================] - 0s - loss: 0.3634 - acc: 0.9566 - val_loss: 0.3896 - val_acc: 0.9444\n",
      "Epoch 12/50\n",
      "576/576 [==============================] - 0s - loss: 0.3137 - acc: 0.9549 - val_loss: 0.3596 - val_acc: 0.9236\n",
      "Epoch 13/50\n",
      "576/576 [==============================] - 0s - loss: 0.2781 - acc: 0.9514 - val_loss: 0.3246 - val_acc: 0.9236\n",
      "Epoch 14/50\n",
      "576/576 [==============================] - 0s - loss: 0.2466 - acc: 0.9566 - val_loss: 0.3053 - val_acc: 0.9167\n",
      "Epoch 15/50\n",
      "576/576 [==============================] - 0s - loss: 0.2234 - acc: 0.9549 - val_loss: 0.2922 - val_acc: 0.9167\n",
      "Epoch 16/50\n",
      "576/576 [==============================] - 0s - loss: 0.2091 - acc: 0.9549 - val_loss: 0.2759 - val_acc: 0.9375\n",
      "Epoch 17/50\n",
      "576/576 [==============================] - 0s - loss: 0.1901 - acc: 0.9583 - val_loss: 0.2696 - val_acc: 0.9167\n",
      "Epoch 18/50\n",
      "576/576 [==============================] - 0s - loss: 0.1791 - acc: 0.9583 - val_loss: 0.2672 - val_acc: 0.9167\n",
      "Epoch 19/50\n",
      "576/576 [==============================] - 0s - loss: 0.1740 - acc: 0.9583 - val_loss: 0.2512 - val_acc: 0.9306\n",
      "Epoch 20/50\n",
      "576/576 [==============================] - 0s - loss: 0.1649 - acc: 0.9601 - val_loss: 0.2481 - val_acc: 0.9236\n",
      "Epoch 21/50\n",
      "576/576 [==============================] - 0s - loss: 0.1534 - acc: 0.9601 - val_loss: 0.2397 - val_acc: 0.9375\n",
      "Epoch 22/50\n",
      "576/576 [==============================] - 0s - loss: 0.1473 - acc: 0.9618 - val_loss: 0.2471 - val_acc: 0.9167\n",
      "Epoch 23/50\n",
      "576/576 [==============================] - 0s - loss: 0.1411 - acc: 0.9618 - val_loss: 0.2379 - val_acc: 0.9375\n",
      "Epoch 24/50\n",
      "576/576 [==============================] - 0s - loss: 0.1356 - acc: 0.9583 - val_loss: 0.2296 - val_acc: 0.9375\n",
      "Epoch 25/50\n",
      "576/576 [==============================] - 0s - loss: 0.1354 - acc: 0.9670 - val_loss: 0.2384 - val_acc: 0.9375\n",
      "Epoch 26/50\n",
      "576/576 [==============================] - 0s - loss: 0.1309 - acc: 0.9601 - val_loss: 0.2309 - val_acc: 0.9375\n",
      "Epoch 27/50\n",
      "576/576 [==============================] - 0s - loss: 0.1239 - acc: 0.9635 - val_loss: 0.2259 - val_acc: 0.9375\n",
      "Epoch 28/50\n",
      "576/576 [==============================] - 0s - loss: 0.1241 - acc: 0.9653 - val_loss: 0.2307 - val_acc: 0.9375\n",
      "Epoch 29/50\n",
      "576/576 [==============================] - 0s - loss: 0.1207 - acc: 0.9635 - val_loss: 0.2262 - val_acc: 0.9375\n",
      "Epoch 30/50\n",
      "576/576 [==============================] - 0s - loss: 0.1167 - acc: 0.9653 - val_loss: 0.2310 - val_acc: 0.9306\n",
      "Epoch 31/50\n",
      "576/576 [==============================] - 0s - loss: 0.1173 - acc: 0.9653 - val_loss: 0.2259 - val_acc: 0.9375\n",
      "Epoch 32/50\n",
      "576/576 [==============================] - 0s - loss: 0.1120 - acc: 0.9635 - val_loss: 0.2294 - val_acc: 0.9375\n",
      "Epoch 33/50\n",
      "576/576 [==============================] - 0s - loss: 0.1133 - acc: 0.9635 - val_loss: 0.2232 - val_acc: 0.9375\n",
      "Epoch 34/50\n",
      "576/576 [==============================] - 0s - loss: 0.1089 - acc: 0.9688 - val_loss: 0.2230 - val_acc: 0.9375\n",
      "Epoch 35/50\n",
      "576/576 [==============================] - 0s - loss: 0.1076 - acc: 0.9670 - val_loss: 0.2242 - val_acc: 0.9375\n",
      "Epoch 36/50\n",
      "576/576 [==============================] - 0s - loss: 0.1054 - acc: 0.9688 - val_loss: 0.2211 - val_acc: 0.9375\n",
      "Epoch 37/50\n",
      "576/576 [==============================] - 0s - loss: 0.1046 - acc: 0.9670 - val_loss: 0.2202 - val_acc: 0.9444\n",
      "Epoch 38/50\n",
      "576/576 [==============================] - ETA: 0s - loss: 0.0529 - acc: 1.000 - 0s - loss: 0.1096 - acc: 0.9670 - val_loss: 0.2380 - val_acc: 0.9306\n",
      "Epoch 39/50\n",
      "576/576 [==============================] - 0s - loss: 0.1003 - acc: 0.9635 - val_loss: 0.2203 - val_acc: 0.9375\n",
      "Epoch 40/50\n",
      "576/576 [==============================] - 0s - loss: 0.0979 - acc: 0.9722 - val_loss: 0.2318 - val_acc: 0.9375\n",
      "Epoch 41/50\n",
      "576/576 [==============================] - 0s - loss: 0.0998 - acc: 0.9705 - val_loss: 0.2258 - val_acc: 0.9444\n",
      "Epoch 42/50\n",
      "576/576 [==============================] - 0s - loss: 0.1007 - acc: 0.9722 - val_loss: 0.2220 - val_acc: 0.9444\n",
      "Epoch 43/50\n",
      "576/576 [==============================] - 0s - loss: 0.0943 - acc: 0.9705 - val_loss: 0.2256 - val_acc: 0.9444\n",
      "Epoch 44/50\n",
      "576/576 [==============================] - 0s - loss: 0.0969 - acc: 0.9722 - val_loss: 0.2221 - val_acc: 0.9444\n",
      "Epoch 45/50\n",
      "576/576 [==============================] - 0s - loss: 0.0954 - acc: 0.9722 - val_loss: 0.2205 - val_acc: 0.9444\n",
      "Epoch 46/50\n",
      "576/576 [==============================] - 0s - loss: 0.0935 - acc: 0.9740 - val_loss: 0.2236 - val_acc: 0.9444\n",
      "Epoch 47/50\n",
      "576/576 [==============================] - 0s - loss: 0.0925 - acc: 0.9705 - val_loss: 0.2237 - val_acc: 0.9444\n",
      "Epoch 48/50\n",
      "576/576 [==============================] - 0s - loss: 0.0932 - acc: 0.9722 - val_loss: 0.2251 - val_acc: 0.9444\n",
      "Epoch 49/50\n",
      "576/576 [==============================] - 0s - loss: 0.0912 - acc: 0.9740 - val_loss: 0.2335 - val_acc: 0.9375\n",
      "Epoch 50/50\n",
      "576/576 [==============================] - 0s - loss: 0.0888 - acc: 0.9705 - val_loss: 0.2323 - val_acc: 0.9444\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x1c23f42a90>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = ks.models.Sequential()\n",
    "model.add(Dense(128, input_dim=data_features.shape[1]))\n",
    "model.add(Activation('relu'))\n",
    "model.add(Dense(64))\n",
    "model.add(Activation('relu'))\n",
    "model.add(Dense(4))\n",
    "model.add(Activation('softmax'))\n",
    "\n",
    "model.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])\n",
    "model.fit(x=data_train,y=labels_train,batch_size=100,nb_epoch=50,verbose=1,validation_data=(data_test,labels_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 神经网络 + SVM\n",
    "45 => 128 => 64 => SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.972222222222\n",
      "0.9375\n"
     ]
    }
   ],
   "source": [
    "get_feature = K.function([model.layers[0].input],[model.layers[2].output])\n",
    "mid_features_train = get_feature([data_train])[0]\n",
    "mid_features_test  = get_feature([data_test])[0]\n",
    "clf = svm.SVC()\n",
    "clf.fit(mid_features_train, class_train)\n",
    "print(clf.score(mid_features_train, class_train))\n",
    "print(clf.score(mid_features_test, class_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 其他机器学习算法\n",
    "- KNN\n",
    "- 线性SVM\n",
    "- 基于核函数的SVM (RBF SVM), Radial basis function kernel\n",
    "- 决策树\n",
    "- 随机森林"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nearest Neighbors\n",
      "train: 0.991319444444\n",
      "test: 0.9375\n",
      "Linear SVM\n",
      "train: 0.949652777778\n",
      "test: 0.916666666667\n",
      "RBF SVM\n",
      "train: 0.954861111111\n",
      "test: 0.916666666667\n",
      "Decision Tree\n",
      "train: 0.996527777778\n",
      "test: 0.9375\n",
      "Random Forest\n",
      "train: 0.993055555556\n",
      "test: 0.9375\n"
     ]
    }
   ],
   "source": [
    "classifiers = [\n",
    "    KNeighborsClassifier(3),\n",
    "    svm.SVC(kernel=\"linear\", C=0.025),\n",
    "    svm.SVC(),\n",
    "    DecisionTreeClassifier(max_depth=5),\n",
    "    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)\n",
    "    ]\n",
    "names = [\"Nearest Neighbors\", \"Linear SVM\", \"RBF SVM\", \"Decision Tree\",\n",
    "         \"Random Forest\"]\n",
    "for name, clf in zip(names,classifiers):\n",
    "    clf.fit(data_train,class_train)\n",
    "    print(name)\n",
    "    print(\"train: {}\".format(clf.score(data_train, class_train)))\n",
    "    print(\"test: {}\".format(clf.score(data_test, class_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.89499915  0.06281427  0.03863219  0.00355432]\n",
      "[ 0.89310551  0.05416692  0.04868099  0.00404656]\n",
      "[ 0.89843506  0.05573963  0.04200708  0.00381828]\n",
      "[ 0.89838797  0.05358611  0.04370584  0.00432   ]\n",
      "[ 0.10541886  0.00658347  0.0701194   0.81787819]\n",
      "[ 0.8179487   0.04564025  0.1304263   0.00598465]\n",
      "[ 0.8330397   0.0987699   0.04428348  0.02390696]\n",
      "[ 0.23016171  0.75411671  0.01306443  0.00265717]\n",
      "[ 0.44072691  0.04287855  0.51088846  0.00550609]\n",
      "[ 0.28720668  0.69722545  0.00956561  0.0060023 ]\n",
      "[ 0.10918946  0.00907711  0.01240001  0.86933339]\n",
      "[ 0.35860217  0.62206674  0.00860362  0.0107275 ]\n",
      "[ 0.08568739  0.01609712  0.88738692  0.01082858]\n"
     ]
    }
   ],
   "source": [
    "for res in model.predict(data_test):\n",
    "    if res[0] < 0.9 and res[1] < 0.9 and res[2] < 0.9 and res[3] < 0.9:\n",
    "        print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
